from IPython.core.display import display, HTML
display(HTML("""<style> .container {width:96% !important;}</style>"""))
from IPython.display import IFrame
import pandas as pd
import numpy as np
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode()
cf.go_offline()
path_data = '../large_data_files/ASA_loan_data/'
df = pd.read_csv(path_data + 'SBAnational.csv', low_memory=False)
print df.shape
df = df[df.ApprovalFY != '1976A']
df['ApprovalFY'] = df.ApprovalFY.astype(int)
df = df[(df.ApprovalFY >= 1990) & (pd.notnull(df.MIS_Status))]
print df.shape
df.head()
def to_float(x):
x = x.replace('$', '')
x = x.replace(',', '')
x = float(x)
return x
df['ChgOffPrinGr'] = df['ChgOffPrinGr'].apply(to_float)
df['GrAppv'] = df['GrAppv'].apply(to_float)
df['SBA_Appv'] = df['SBA_Appv'].apply(to_float)
df.head()
default.head()
df.MIS_Status.value_counts()
df.ApprovalFY.value_counts().sort_index()
df.groupby('ApprovalFY').count().max(1).iplot(kind = 'bar', title = 'Loan cases')
(df[df.MIS_Status == 'CHGOFF'].groupby('ApprovalFY').count().max(1) /
df.groupby('ApprovalFY').count().max(1)).iplot(kind = 'bar', title = 'Default rate')
default = df[df.MIS_Status == 'CHGOFF'].copy()
default.groupby('ApprovalFY').SBA_Appv.sum().iplot(kind = 'bar')
default.head()
case = pd.read_csv(path_data + 'SBAcase_11_13_17.csv', low_memory=False)
print(df.shape,
case.shape)
df.head()
case.head()
case.LoanNr_ChkDgt.isin(df.LoanNr_ChkDgt).sum()
df.LoanNr_ChkDgt.isin(case.LoanNr_ChkDgt).sum()
[col for col in case.columns if col not in df.columns]
[col for col in df.columns if col not in case.columns]